Load in libraries

#-------------------------------------------------------------------------------
# ANALYSIS OF T CELL POLYFUNCTIONALITY DATA
#-------------------------------------------------------------------------------
library(tidyverse)
library(gamlss)
library(emmeans)
library(RColorBrewer)
library(cowplot)
library(writexl)
library(DT)
map <- purrr::map

# functions to run EM for choosing threshold
source("scripts/utils/normal_beta_em_utils.R")
source("scripts/utils/utils.R")
modernacolors()

Read in and process dataset

set.seed(43210)
d <- readxl::read_xlsx("processed_data/polyfunctional_tcell.xlsx") %>%
  dplyr::filter(!str_detect(Stimulation, "PMA")) %>%
  dplyr::filter(!is.na(dose) & dose > 0)

Data thresholding

As described in the Statistical Methods materials, we first need to threshold the polyfunctional T cell data, in order to capture data from the null (i.e., truly zero) and non-null (i.e., some positive proportion of polyfunctional cells in the sample) populations. This is done using a normal-beta mixture model, where the normal distribution, centered at 0, captures the null component, and the beta distribution captures the non-null component.

About preprocessing

Note that we are reporting “Aggregate composition.” This means, among the 3 cytokines we are studying, we are agnostic to which ones are positively expressed. We are only looking for the number of positively expressed cytokines. So, we sum across compositions that are single producers, and sum across compositions that are dual producers, to generate an “aggregate composition” of single and dual producers.

Threshold estimation

The below script estimates these models within each relevant Stimulations/Cell population group. The resulting parameter estimates are displayed. prop0 is the mixing proportion of the null population. mu and sigma are the mean and standard deviation of the normal component. shape1 and shape2 and the shape parameters of the beta distribution.

#-------------------------------------------------------------------------------
# Estimate a distribution for background and signal data using a normal-beta
# mixture model. This is motivated by the SPICE paper. Estimation is done for
# cell type/day specific data. Do estimation using the EM algorithm.
#-------------------------------------------------------------------------------
spl <- group_split(d, cell_population, day) %>%
  purrr::map(~ {
    if(.x$cell_population[1] == "CD4 T") {
      dplyr::filter(.x, Stimulation == "S2")
    } else if(.x$cell_population[1] == "CD8 T") {
      dplyr::filter(.x, Stimulation == "S1")
    }
  })
names(spl) <-
  unlist(group_keys(group_by(d, cell_population, day)) %>% unite(col = nm))
em_fits <-
  purrr::map(spl,
             ~ fit_beta_normal_mixture(.x$agg_composition, mu_equal_zero = TRUE))

purrr::imap_dfr(em_fits,
                ~ as_tibble(.x$parameters) %>% mutate(nm = .y, .before = 1)) %>%
  separate(nm, c("Cell population", "Stimulation"), sep = "_") %>%
  knitr::kable()
Cell population Stimulation prop0 mu sigma shape1 shape2
CD4 T 64 0.0461383 0 0.0005177 2.5741847 2736.2961
CD4 T 87 0.0575217 0 0.0010990 1.6881778 1791.6222
CD4 T 143 0.1394895 0 0.0017161 3.4336328 4276.7515
CD4 T 226 0.1018359 0 0.0009028 2.5371268 6270.7086
CD8 T 64 0.0432076 0 0.0002053 0.5476772 106.3619
CD8 T 87 0.0671753 0 0.0018250 0.6725799 161.8807
CD8 T 143 0.0115484 0 0.0024120 1.1334973 186.8061
CD8 T 226 0.0000000 0 0.0010402 0.9915519 255.5449

Thresholding visualizations

We define the threshold as the point at which observations have over a 95% chance of belonging to the non-null component, based on the model fit. The below plots visualize the estimated threshold (purple dashed line), as well as the histogram of fitted and observed models.

#-------------------------------------------------------------------------------
# Plot estimated mixture distribution vs. the real data
# Calculate model-driven threshold in the process
# Keep as non-zero the observations which have >= 95% chance of belonging to
#   the signal distribution. This is motivated as the appropriate threshold for
#   smaller datasets in the SPICE paper.
#-------------------------------------------------------------------------------
mixture_density_plots <- list()
for (i in seq_along(spl)) {
  sim <-
    sample(
      0:1,
      replace = TRUE,
      size = 10000,
      prob = c(em_fits[[i]]$parameters$prop0, 1 - em_fits[[i]]$parameters$prop0)
    ) %>%
    tibble(sim_dat = c(
      rbeta(
        n = sum(. == 1),
        shape1 = em_fits[[i]]$parameters$shape1,
        shape2 = em_fits[[i]]$parameters$shape2
      ),
      rnorm(
        n = sum(. == 0),
        mean = em_fits[[i]]$parameters$mu,
        sd = em_fits[[i]]$parameters$sigma
      )
    ),
    group = c(rep("non-zero", sum(. == 1)), rep("zero", sum(. == 0)))) %>%
    dplyr::select(-1)
  
  thresh <-
    dplyr::filter(spl[[i]], em_fits[[i]]$z[, "non-zero"] >= .95) %>%
    pull(agg_composition) %>%
    min()
  spl[[i]] <- mutate(spl[[i]], thresh = thresh)
  
  mixture_density_plots[[i]] <-
    ggplot(spl[[i]], aes(x = agg_composition, y = ..density..)) +
    geom_histogram(
      color = "grey40",
      fill = "white",
      bins = 20,
      size = 1.1
    ) +
    geom_density(
      data = filter(sim, group == "zero"),
      aes(
        x = sim_dat,
        y = ..density.. * em_fits[[i]]$parameters$prop0,
        color = group,
        fill = group,
        group = group
      ),
      alpha = 0.3,
      size = 1.1
    ) +
    geom_density(
      data = filter(sim, group == "non-zero"),
      aes(
        x = sim_dat,
        y = ..density.. * (1 - em_fits[[i]]$parameters$prop0),
        color = group,
        fill = group,
        group = group
      ),
      alpha = 0.3,
      size = 1.1
    ) +
    scale_fill_manual(values = c(modernalightgray, modernared)) +
    scale_color_manual(values = c(modernalightgray, modernared)) +
    geom_vline(
      aes(xintercept = thresh),
      color = modernapurple,
      size = 1.2,
      linetype = "twodash"
    ) +
    labs(
      y = "",
      x = "Aggregate composition",
      fill = "Estimated\ndistribution",
      color = "Estimated\ndistribution"
    ) +
    ggtitle(paste0(
      spl[[i]]$cell_population[1],
      " cells, Day ",
      spl[[i]]$day[1],
      ", ",
      spl[[i]]$Stimulation[1]
    )) +
    theme_minimal()
}
leg <- cowplot::get_legend(mixture_density_plots[[1]])
mixture_density_plots <-
  map(mixture_density_plots, ~ .x + theme(legend.position = "none"))
pgrid <-
  cowplot::plot_grid(plotlist = mixture_density_plots,
                     nrow = 2,
                     ncol = 4)
pgrid_out <-
  cowplot::plot_grid(
    pgrid,
    leg,
    nrow = 1,
    ncol = 2,
    rel_widths = c(10, 1)
  )
print(pgrid_out)

The estimated thresholds for each cell population/stimulation combination are displayed below.

#-------------------------------------------------------------------------------
# Table of thresholds 
#-------------------------------------------------------------------------------
# Extract mixture model thresholds
threshd_mix <- bind_rows(spl) %>%
  mutate(
    thresh_agg_composition = case_when(
      agg_composition < thresh ~ 0,
      agg_composition >= thresh ~ agg_composition
    )
  )

thresh_table <-
  dplyr::select(threshd_mix, cell_population, Stimulation, day, thresh) %>%
  arrange(cell_population, Stimulation, day) %>%
  distinct() %>%
  setNames(c("Cell population", "Stimulation", "Day", "threshold"))

knitr::kable(thresh_table)
Cell population Stimulation Day threshold
CD4 T S2 64 0.0002900
CD4 T S2 87 0.0000942
CD4 T S2 143 0.0003194
CD4 T S2 226 0.0000858
CD8 T S1 64 0.0004000
CD8 T S1 87 0.0000000
CD8 T S1 143 0.0001900
CD8 T S1 226 0.0000703

Thresholded data visualization

Supplementary Figure 8: Animal-level Spike-specific CD4+ and CD8+ T-Cell Responses

#-------------------------------------------------------------------------------
# Boxplots of aggregate composition across weeks post dose, by day
#-------------------------------------------------------------------------------
p_boxplot2 <- group_split(threshd_mix, cell_population) %>%
  purrr::map(
    ~ ggplot(.x, aes(x = weeks_post_dose, y = thresh_agg_composition, group = interaction(factor(num_positive), weeks_post_dose))) +
      geom_boxplot(aes(color = factor(num_positive)), outlier.shape = NA) +
      geom_jitter(aes(color = factor(num_positive)), position = position_jitterdodge(), pch = 21) +
      scale_color_manual(values = c(modernaorange, modernablue, modernapurple, modernagreen)) +
      scale_y_continuous(trans = "sqrt") +
      facet_grid(day ~ Stimulation,
                 labeller = labeller("day" = function(.x) paste0("Day ", .x))) +
      labs(x = "Weeks post dose", y = "Aggregate composition", title = paste0(.x$cell_population[1], " cells"), color = "# positive") +
      theme_bw()
  ) 
walk(p_boxplot2, print)

Figure 5: Aggregate trend in T cell polyfunctionality by cell population and peptide pool

#-------------------------------------------------------------------------------
# Stacked bar charts of average (across mice within group) aggregate composition
#   across weeks post dose
#-------------------------------------------------------------------------------
barp <- threshd_mix %>%
  group_split(cell_population, group, weeks_post_dose, day, Stimulation, num_positive) %>%
  map_dfr(~ mutate(.x, agg_composition=mean(agg_composition), mouse_id = NULL) %>% distinct()) %>%
  group_split(cell_population) %>%
  purrr::map(
    ~ ggplot(.x, aes(x = weeks_post_dose, y = thresh_agg_composition, group = interaction(factor(num_positive), weeks_post_dose))) +
      geom_bar(aes(fill = factor(num_positive)), position = "stack", stat = "identity") +
      scale_fill_manual(values = brewer.pal(n = 9, "BuGn")[c(4,6,9)]) +
      facet_grid(day ~ Stimulation,
                 labeller = labeller("num_positive" = function(.x) paste0(.x, " positive"), "day" = function(.x) paste0("Day ", .x))) +
      labs(x = "Weeks post dose", y = "Mean aggregate composition", title = paste0(.x$cell_population[1], " cells"), fill = "# positive") +
      theme_bw()
  )
walk(barp, print)

Model-based group-wise analyses

Regression modeling

We use a zero-inflated beta regression model to model the proportion of single, dual, and triple cytokine expressors. We fit models for CD4+ and CD8+ T cells, and for single, dual, and triple expressors separately, resulting in 6 total models.

#-------------------------------------------------------------------------------
# Multivariate mixed modeling
#-------------------------------------------------------------------------------
d4 <- filter(threshd_mix, cell_population == "CD4 T") %>%
  mutate(agg_composition = NULL) %>%
  pivot_wider(id_cols = everything(), names_from = "num_positive", values_from = "thresh_agg_composition") %>%
  rename("one" = "1", "two" = "2", "three" = "3")
d8 <- filter(threshd_mix, cell_population == "CD8 T") %>%
  mutate(agg_composition = NULL) %>%
  pivot_wider(id_cols = everything(), names_from = "num_positive", values_from = "thresh_agg_composition") %>%
  rename("one" = "1", "two" = "2", "three" = "3")

f4 <- purrr::map(c("one", "two", "three"), ~ {
   if(.x == "one") {
    form <- as.formula(paste0(.x, " ~ factor(day) * factor(weeks_post_dose)"))
  } else {
    form <- as.formula(paste0(.x, " ~ factor(day) + factor(weeks_post_dose)"))  
  }
  sigma_form <- as.formula("~ factor(weeks_post_dose)")
  
  gamlss::gamlss(
    form,
    sigma.formula = sigma_form,
    family = BEZI,
    data = d4,
    trace = FALSE,
    control = gamlss.control(n.cyc = 80, trace = FALSE),
    i.control = glim.control(glm.trace = FALSE, bf.trace = FALSE)
    )
}) %>%
  setNames(c("one", "two", "three"))

f8 <- purrr::map(c("one", "two", "three"), ~ {
  if(.x == "one") {
    form <- as.formula(paste0(.x, " ~ factor(day) * factor(weeks_post_dose)"))
  } else {
    form <- as.formula(paste0(.x, " ~ factor(day) + factor(weeks_post_dose)"))  
  }
  
  sigma_form <- as.formula("~ factor(weeks_post_dose)")
  
  gamlss::gamlss(
    form,
    sigma.formula = sigma_form,
    family = BEZI,
    data = d8,
    trace = FALSE,
    control = gamlss.control(n.cyc = 80, trace = FALSE),
    i.control = glim.control(glm.trace = FALSE, bf.trace = FALSE)
  )
  
}) %>%
  setNames(c("one", "two", "three"))

ds <- ls(pattern = "d[[:digit:]]") %>% lapply(function(x) eval(as.name(x)))
names(ds) <- ls(pattern = "d[[:digit:]]")

fits <- ls(pattern = "f[[:digit:]]") %>% lapply(function(x) eval(as.name(x)))
names(fits) <- ls(pattern = "f[[:digit:]]")

Diagnostics

Residual plots

Since the data are modeled using a generalized linear model assuming a distribution that contains both discrete and continuous components, normalized randomized quantile residuals (RQRs) are used in residual diagnostic plots.

par(mfrow = c(3,2))
iwalk(f4, ~ {
  plot(fitted(.x), residuals(.x), xlab = "Fitted values", ylab = "RQRs", main = paste0("Residual plots for CD4+ T cells with ", .y, "\ncytokine(s) expressed"))
  abline(h = 0, col = "tomato2", lty = 2)
  qqnorm(residuals(.x));
  qqline(residuals(.x))
  })

par(mfrow = c(3,2))
iwalk(f8, ~ {
  plot(fitted(.x), residuals(.x), xlab = "Fitted values", ylab = "RQRs", main = paste0("Residual plots for CD8+ T cells with ", .y, "\ncytokine(s) expressed"))
  abline(h = 0, col = "tomato2", lty = 2)
  qqnorm(residuals(.x));
  qqline(residuals(.x))
  })

Supplementary Figures 9-10: Statistical comparisons of CD4+ and CD8+ T cell cytokine polyfunctionality

#-------------------------------------------------------------------------------
# Differences in consecutive dosing intervals
#-------------------------------------------------------------------------------
em_fits1 <- imap(fits, ~ {
  lapply(.x, function(X) {
    emmeans::emmeans(
      ref_grid(X),
      specs = ~ weeks_post_dose | day
    ) %>%
      contrast(method = "revpairwise", type = "response") %>%
      confint(level = 0.95, adjust = "mvt")
  })
})
names(em_fits1) <- names(fits)

contrast_plots1 <- imap(em_fits1, ~ {
  titles <- names(.x)
  out <- list()
  for(i in seq_along(titles)) {
    out[[i]] <- plot(.x[[i]]) +
      geom_vline(aes(xintercept = 1), linetype = "twodash", color = "red") +
      facet_grid(~ day, labeller = labeller(day = function(.y) paste0("Day ", .y))) +
      labs(x = "Odds ratio", y = "Contrast in dosing interval") +
      ggtitle(paste0("CD", str_extract(.y, "[[:digit:]]"), " T cells, ", titles[i], " function(s)")) +
      theme_bw()
  }
  out
})

walk(contrast_plots1, print)
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[1]]

## 
## [[2]]

## 
## [[3]]

Supplementary Table 5: T cell polyfunctionality group comparison P-values

#-------------------------------------------------------------------------------
# Tables of P-values to go with the figure
#-------------------------------------------------------------------------------
em_tables <- imap(fits, ~ {
  lapply(.x, function(X) {
    emmeans::emmeans(
      ref_grid(X),
      specs = ~ weeks_post_dose | day
    ) %>%
      contrast(method = "revpairwise", type = "response", adjust = "mvt")
  })
})

pvals <- imap_dfr(em_tables, ~ {
  imap_dfr(.x, function(.xx, .yy) {
    as_tibble(.xx) %>%
      mutate(num_cytokines = .yy)
  }) %>%
    mutate(tcell = .y)
}) %>%
  dplyr::select(contrast, odds.ratio, day, tcell, num_cytokines, p.value) %>%
  rename(
    "Pairwise comparison" = "contrast",
    "Odds ratio" = "odds.ratio",
    "Day" = "day",
    "Cell" = "tcell",
    "# positive" = "num_cytokines",
    "Adjusted P-value" = p.value
  ) %>%
  mutate(
    Cell = case_when(
      Cell == "f4" ~ "CD4+ T",
      Cell == "f8" ~ "CD8+ T"
      ),
    `# positive` = case_when(
      `# positive` == "one" ~ 1,
      `# positive` == "two" ~ 2,
      `# positive` == "three" ~ 3
  )) %>%
  filter(`Adjusted P-value` <= 0.05)

DT::datatable(pvals)

Session Information

sessionInfo()
## R version 4.1.2 (2021-11-01)
## Platform: x86_64-apple-darwin19.6.0 (64-bit)
## Running under: macOS Catalina 10.15.7
## 
## Matrix products: default
## BLAS:   /usr/local/Cellar/openblas/0.3.19/lib/libopenblasp-r0.3.19.dylib
## LAPACK: /usr/local/Cellar/r/4.1.2/lib/R/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] parallel  splines   stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] mclust_5.4.9       magrittr_2.0.2     DT_0.21            writexl_1.4.0     
##  [5] cowplot_1.1.1      RColorBrewer_1.1-2 emmeans_1.7.4-1    gamlss_5.4-1      
##  [9] nlme_3.1-155       gamlss.dist_6.0-3  MASS_7.3-55        gamlss.data_6.0-2 
## [13] forcats_0.5.1      stringr_1.4.0      dplyr_1.0.9        purrr_0.3.4       
## [17] readr_2.1.2        tidyr_1.2.0        tibble_3.1.7       ggplot2_3.3.6     
## [21] tidyverse_1.3.1   
## 
## loaded via a namespace (and not attached):
##  [1] fs_1.5.2          bit64_4.0.5       lubridate_1.8.0   httr_1.4.2       
##  [5] tools_4.1.2       backports_1.4.1   bslib_0.3.1       utf8_1.2.2       
##  [9] R6_2.5.1          DBI_1.1.2         colorspace_2.0-3  withr_2.5.0      
## [13] tidyselect_1.1.2  bit_4.0.4         compiler_4.1.2    textshaping_0.3.6
## [17] cli_3.3.0         rvest_1.0.2       xml2_1.3.3        sandwich_3.0-1   
## [21] labeling_0.4.2    sass_0.4.0        scales_1.2.0      mvtnorm_1.1-3    
## [25] systemfonts_1.0.4 digest_0.6.29     rmarkdown_2.13    pkgconfig_2.0.3  
## [29] htmltools_0.5.2   dbplyr_2.1.1      fastmap_1.1.0     highr_0.9        
## [33] htmlwidgets_1.5.4 rlang_1.0.2       readxl_1.3.1      rstudioapi_0.13  
## [37] farver_2.1.0      jquerylib_0.1.4   generics_0.1.2    zoo_1.8-9        
## [41] jsonlite_1.8.0    crosstalk_1.2.0   vroom_1.5.7       Matrix_1.4-0     
## [45] Rcpp_1.0.8.3      munsell_0.5.0     fansi_1.0.3       lifecycle_1.0.1  
## [49] stringi_1.7.3     multcomp_1.4-18   yaml_2.3.5        grid_4.1.2       
## [53] crayon_1.5.1      lattice_0.20-45   haven_2.4.3       hms_1.1.1        
## [57] knitr_1.37        pillar_1.7.0      estimability_1.3  codetools_0.2-18 
## [61] reprex_2.0.1      glue_1.6.2        evaluate_0.15     modelr_0.1.8     
## [65] vctrs_0.4.1       tzdb_0.2.0        cellranger_1.1.0  gtable_0.3.0     
## [69] assertthat_0.2.1  xfun_0.30         xtable_1.8-4      broom_0.7.12     
## [73] coda_0.19-4       ragg_1.2.1        survival_3.2-13   TH.data_1.1-0    
## [77] ellipsis_0.3.2